package com.geccocrawler.gecco.spider;

import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.downloader.AfterDownload;
import com.geccocrawler.gecco.downloader.BeforeDownload;
import com.geccocrawler.gecco.downloader.DownloadException;
import com.geccocrawler.gecco.downloader.Downloader;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.response.HttpResponse;
import com.geccocrawler.gecco.scheduler.Scheduler;
import com.geccocrawler.gecco.scheduler.UniqueSpiderScheduler;
import com.geccocrawler.gecco.spider.render.Render;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.util.List;
import java.util.concurrent.CountDownLatch;

/**
 * A single crawler worker. A crawl engine may own several spiders, each of which is meant to
 * run on its own thread. A spider repeatedly takes a request from its own scheduler (falling
 * back to the engine's start scheduler), downloads the page, renders the matching
 * {@link SpiderBean} and hands it to the configured pipelines.
 *
 * <p>Control methods ({@link #pause()}, {@link #restart()}, {@link #stop()}) may be called from
 * other threads. Interrupting the worker thread is treated as a stop request.
 *
 * @author huchengyi
 */
public class Spider implements Runnable {

    private static final Log log = LogFactory.getLog(Spider.class);

    /**
     * Latch the worker blocks on while paused. Volatile because it is written by the controlling
     * thread and read by the worker thread.
     */
    private volatile CountDownLatch pauseCountDown;

    private volatile boolean stop;

    private volatile boolean pause;

    private final GeccoEngine engine;

    private Scheduler spiderScheduler;

    /**
     * The bean class matched for the request currently being processed
     * ({@code null} when the current request matched no bean).
     */
    public Class<? extends SpiderBean> currSpiderBeanClass;

    /**
     * Creates a spider bound to the given engine with its own {@link UniqueSpiderScheduler}.
     *
     * @param engine the engine that owns this spider
     */
    public Spider(GeccoEngine engine) {
        this.engine = engine;
        this.spiderScheduler = new UniqueSpiderScheduler();
        this.pause = false;
        this.stop = false;
    }

    /**
     * Main crawl loop. Runs until {@link #stop()} is called, the thread is interrupted, or both
     * the spider scheduler and the engine's start scheduler are empty. In every case the engine
     * is notified through {@code engine.notifyComplete()} before the loop exits.
     */
    @Override
    public void run() {
        // Publish this spider in the thread-local so later stages can look it up.
        SpiderThreadLocal.set(this);
        while (true) {
            // Stop requested, or the thread was interrupted: tell the engine this worker is done.
            if (stop || Thread.currentThread().isInterrupted()) {
                engine.notifyComplete();
                break;
            }
            // Paused: block until restart()/stop(), then re-evaluate the stop flag from the top.
            if (pause) {
                awaitRestart();
                continue;
            }
            // Fetch the next request: the spider's own queue first, then the start scheduler.
            boolean start = false;
            HttpRequest request = spiderScheduler.out();
            if (request == null) {
                request = engine.getScheduler().out();
                if (request == null) {
                    // Nothing left to crawl: tell the engine this worker is done.
                    engine.notifyComplete();
                    break;
                }
                start = true;
            }
            if (log.isDebugEnabled()) {
                log.debug("match url : " + request.getUrl());
            }
            // Resolve the SpiderBean class for this request.
            currSpiderBeanClass = engine.getSpiderBeanFactory().matchSpider(request);

            crawl(request);

            // Politeness delay between requests.
            interval();
            // In loop mode a start request goes back to the tail of the start queue.
            if (start && engine.isLoop()) {
                engine.getScheduler().into(request);
            }
        }
    }

    /**
     * Downloads one request and processes the result. Any exception is logged (with stack trace
     * when the engine is in debug mode) and the response, if any, is always closed.
     *
     * @param request the request to process; {@link #currSpiderBeanClass} must already be set
     */
    private void crawl(HttpRequest request) {
        HttpResponse response = null;
        try {
            if (currSpiderBeanClass == null) {
                // No bean matches this URL, but a 301/302 redirect target must still be queued.
                response = defaultDownload(request);
                if (response != null && isRedirect(response)) {
                    spiderScheduler.into(request.subRequest(response.getContent()));
                } else {
                    log.error("cant't match url : " + request.getUrl());
                }
            } else {
                // Bean context: downloader, before/after hooks, render and pipelines.
                SpiderBeanContext context = getSpiderBeanContext();

                response = getResponse(request, null, context, engine.getRetry());

                if (response != null) {
                    if (response.getStatus() == 200) {
                        Render render = context.getRender();
                        SpiderBean spiderBean = render.inject(currSpiderBeanClass, request, response);
                        pipelines(spiderBean, context);
                    } else if (isRedirect(response)) {
                        spiderScheduler.into(request.subRequest(response.getContent()));
                    }
                }
            }
        } catch (Exception ex) {
            if (engine.isDebug()) {
                log.error(request.getUrl() + " ERROR : ", ex);
            }
            log.error(request.getUrl() + " ERROR : " + ex.getClass().getName() + "||" + ex.getMessage());
        } finally {
            if (response != null) {
                response.close();
            }
        }
    }

    /**
     * @param response a non-null response
     * @return {@code true} when the status is 301 or 302
     */
    private static boolean isRedirect(HttpResponse response) {
        int status = response.getStatus();
        return status == 302 || status == 301;
    }

    /**
     * Blocks the worker on the current pause latch. If the thread is interrupted while waiting,
     * the interrupt flag is restored so the run loop can terminate.
     */
    private void awaitRestart() {
        CountDownLatch latch = this.pauseCountDown;
        if (latch == null) {
            return;
        }
        try {
            latch.await();
        } catch (InterruptedException e) {
            log.error("can't pause : ", e);
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Downloads the request, retrying on failure with a linearly growing back-off
     * ({@code engine.getInterval() * attemptNumber} milliseconds).
     *
     * @param request  the request to download
     * @param response value returned when no attempt produces a response (normally {@code null})
     * @param context  the bean context supplying downloader and hooks
     * @param retry    maximum number of attempts; values below 1 are treated as 1 so that the
     *                 request is always tried at least once
     * @return the first non-null response, or {@code response} if every attempt failed
     * @throws RuntimeException if the thread is interrupted during back-off (the interrupt flag
     *                          is restored first)
     */
    private HttpResponse getResponse(HttpRequest request, HttpResponse response, SpiderBeanContext context, int retry) {
        // A retry count of 0 (or less) must not suppress the download altogether.
        final int attempts = Math.max(1, retry);

        for (int i = 0; i < attempts; i++) {
            try {
                response = download(context, request);
                if (response != null) {
                    break;
                }
            } catch (Exception e) {
                log.error(request.getUrl() + " ERROR" + i + ": " + e.getClass().getName() + "||" + e.getMessage());
                if (i + 1 != attempts) {
                    try {
                        log.error("休息一下[" + engine.getInterval() * (i + 1) + "]");
                        Thread.sleep(engine.getInterval() * (i + 1));

                    } catch (InterruptedException ex) {
                        // Preserve the interrupt for the run loop before propagating.
                        Thread.currentThread().interrupt();
                        throw new RuntimeException(ex);
                    }
                }
            }
        }

        return response;
    }


    /**
     * Pauses the spider. The request currently being processed is completed; afterwards the
     * worker blocks until {@link #restart()} (or {@link #stop()}) is called. Calling this while
     * already paused has no effect, so the latch the worker is waiting on is never replaced.
     */
    public synchronized void pause() {
        if (this.pause) {
            return;
        }
        // The latch must be visible before the flag: the worker reads the flag first.
        this.pauseCountDown = new CountDownLatch(1);
        this.pause = true;
    }

    /**
     * Resumes a paused spider. Safe to call when the spider was never paused.
     */
    public synchronized void restart() {
        // Clear the flag first so a woken worker does not immediately try to pause again.
        this.pause = false;
        CountDownLatch latch = this.pauseCountDown;
        if (latch != null) {
            latch.countDown();
        }
    }

    /**
     * Requests the spider to stop. A paused worker is released so it can observe the stop flag
     * and terminate instead of waiting forever.
     */
    public void stop() {
        this.stop = true;
        CountDownLatch latch = this.pauseCountDown;
        if (latch != null) {
            latch.countDown();
        }
    }

    /**
     * Passes the rendered bean through every pipeline of the context, in order.
     *
     * @param spiderBean the rendered bean; nothing happens when {@code null}
     * @param context    the bean context supplying the pipelines (may return {@code null})
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private void pipelines(SpiderBean spiderBean, SpiderBeanContext context) {
        if (spiderBean == null) {
            return;
        }
        List<Pipeline> pipelines = context.getPipelines();
        if (pipelines != null) {
            for (Pipeline pipeline : pipelines) {
                pipeline.process(spiderBean);
            }
        }
    }

    /**
     * Sleeps for a randomized interval around {@code engine.getInterval()} when it is positive.
     * An interrupt ends the sleep early and is re-asserted for the run loop.
     */
    private void interval() {
        int interval = engine.getInterval();
        if (interval > 0) {
            try {
                Thread.sleep(randomInterval(interval));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Downloads with the factory's default downloader and no before/after hooks.
     *
     * @param request the request to download
     * @return the response produced by the default downloader
     * @throws DownloadException if the download fails
     */
    private HttpResponse defaultDownload(HttpRequest request) throws DownloadException {
        HttpResponse response = download(null, request);
        return response;
    }

    /**
     * Performs one download: runs the before hook, adds the engine's global cookies, downloads,
     * then runs the after hook.
     *
     * @param context bean context supplying downloader, hooks and timeout; when {@code null} the
     *                default downloader is used with a timeout of 1000
     * @param request the request to download
     * @return the downloader's response
     * @throws DownloadException if the download fails
     */
    private HttpResponse download(SpiderBeanContext context, HttpRequest request) throws DownloadException {
        log.info(request.getUrl());
        Downloader currDownloader = null;
        BeforeDownload before = null;
        AfterDownload after = null;
        int timeout = 1000;
        if (context != null) {
            currDownloader = context.getDownloader();
            before = context.getBeforeDownload();
            after = context.getAfterDownload();
            timeout = context.getTimeout();
        } else {
            currDownloader = engine.getSpiderBeanFactory().getDownloaderFactory().defaultDownloader();
        }
        if (before != null) {
            before.process(request);
        }

        // Apply the engine-wide cookies.
        // NOTE(review): RFC 6265 separates cookie pairs with "; " rather than ","; left unchanged
        // because the element format and downloader handling are not visible here - confirm.
        String[] engineCookies = engine.getCookies();
        if (engineCookies != null && engineCookies.length > 0) {
            request.addHeader("Cookie", StringUtils.join(engineCookies, ","));
        }

        HttpResponse response = currDownloader.download(request, timeout);

        // If the after hook fails the caller never receives the response, so close it here to
        // avoid leaking it (the retry loop would otherwise drop it on the floor).
        boolean handedOff = false;
        try {
            if (after != null) {
                after.process(request, response);
            }
            handedOff = true;
            return response;
        } finally {
            if (!handedOff && response != null) {
                response.close();
            }
        }
    }

    /**
     * Picks a random delay within one second either side of the configured interval, with a
     * lower bound of 1 millisecond.
     *
     * @param interval the configured interval in milliseconds
     * @return a delay between {@code max(1, interval - 1000)} and {@code interval + 1000}
     */
    private int randomInterval(int interval) {
        int min = interval - 1000;
        if (min < 1) {
            min = 1;
        }
        int max = interval + 1000;
        return (int) Math.rint(Math.random() * (max - min) + min);
    }

    /**
     * @return the engine that owns this spider
     */
    public GeccoEngine getEngine() {
        return engine;
    }

    /**
     * @return the scheduler holding this spider's own pending requests
     */
    public Scheduler getSpiderScheduler() {
        return spiderScheduler;
    }

    /**
     * @param spiderScheduler the scheduler to use for this spider's own pending requests
     */
    public void setSpiderScheduler(Scheduler spiderScheduler) {
        this.spiderScheduler = spiderScheduler;
    }

    /**
     * @return the bean context registered for {@link #currSpiderBeanClass}
     */
    public SpiderBeanContext getSpiderBeanContext() {
        return engine.getSpiderBeanFactory().getContext(currSpiderBeanClass);
    }
}
